import requests
from bs4 import BeautifulSoup
import time
import csv

# Browser-like User-Agent so Tieba does not reject the request as an obvious bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Thread URL template; {} is filled with the 1-based page number (the `pn` query param).
base_url = "https://tieba.baidu.com/p/9181762031?pn={}"
page_number = 1

def _text_of(parent, tag, cls):
    """Return the stripped text of the first matching descendant, or '' when absent.

    Tieba intermittently serves anti-bot / partial pages; `.find()` then returns
    None and unguarded `.text` raises AttributeError.
    """
    node = parent.find(tag, {'class': cls})
    return node.text.strip() if node is not None else ''


def _last_tail_info(parent):
    """Return the text of the last 'tail-info' span (the timestamp), or '' if none."""
    spans = parent.find_all('span', {'class': 'tail-info'})
    return spans[-1].text.strip() if spans else ''


# Scrape every page of the thread and append one CSV row per post/reply.
with open('../tieba_replies.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['发帖人', '发帖时间', '发帖内容', '回复人', '回复时间', '回复内容'])

    while True:
        # A timeout keeps the scraper from hanging forever on a stalled connection.
        response = requests.get(base_url.format(page_number), headers=headers, timeout=10)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'lxml')

        # Main-post info is present only on the first page.
        if page_number == 1:
            main_post = _text_of(soup, 'div', 'd_post_content')
            author = _text_of(soup, 'li', 'd_name')
            post_time = _text_of(soup, 'span', 'tail-info')
            # Pad to six columns so this row lines up with the header.
            writer.writerow([author, post_time, main_post, '', '', ''])

        # Replies on this page; an empty list means we ran past the last page.
        replies = soup.find_all('div', {'class': 'l_post'})
        if not replies:
            break

        for reply in replies:
            reply_content = _text_of(reply, 'div', 'd_post_content')
            reply_author = _text_of(reply, 'li', 'd_name')
            reply_time = _last_tail_info(reply)
            writer.writerow(['', '', '', reply_author, reply_time, reply_content])

        page_number += 1
        time.sleep(2)  # throttle requests to avoid getting banned
